Web Scraping with Python






Chaitanya Tejaswi


22nd January, 2020

Objectives

Assumptions

Motivation for This Talk

“Can I read this on my Kindle?”

The Solution: On Amazon Kindle

The Solution: On Android Device

Observations

“Can I read this on my Kindle?”

How To Do It?

  1. Send an HTTP request to the server for the file.
  2. Get the file, filter out the “title” & “judgement” (summary).
  3. Save this to a text/html file.
  4. Convert this file to an eBook, particularly one that is compatible with Android & Kindle.

But First, Some Prerequisites

Send A Request, Retrieve A File

from urllib import request
...
response = request.urlopen(url).read().decode('utf-8')

Create An HTML object

from bs4 import BeautifulSoup
...
html = BeautifulSoup(response, 'lxml')

Finding <tags>

# Find headline of text
headline = article.h2.a.text

Syntax

.find(tag, attributes, recursive, text, keywords)
.find_all(tag, attributes, recursive, text, limit, keywords)
# [tag] Find all headings in the page
.find_all('h1')
.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# [attributes] Find all <span> that contain green/red colored text
.find_all('span', {'class': {'green', 'red'}})

# [text] How many times is "prince" displayed on the webpage?
.find_all(text='prince')

# [keywords]
.find_all(id='span', class_={'green', 'red'})

Note: OR/AND

.find_all('div', id={'title','summary'}, class_={'green', 'red'})

Observations

“Can I read this on my Kindle?”

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch the Indian Kanoon judgement with document id *urlId* and
    save its title and body as ``<urlId>.html`` in the working directory.

    Any failure (network error, missing div) is printed, not raised.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        # The judgement body and its title live in well-known divs.
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Write UTF-8 explicitly: the page was decoded as UTF-8, and the
        # platform default encoding (e.g. cp1252 on Windows) may fail on
        # characters the scraped text contains.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Broad catch keeps the demo from crashing; too coarse for
        # production code.
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch judgement *urlId* from Indian Kanoon and save it as
    ``<urlId>.html``; errors are printed rather than raised."""
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # [1] <-- Process the links
        # UTF-8 explicitly: the platform default encoding (cp1252 on
        # Windows) can fail on characters in the scraped text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # [2] <-- Automatically open the file
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateHtml(urlId):
    """Fetch judgement *urlId*, absolutize its site-relative links,
    save it as ``<urlId>.html`` and open it in the default browser."""
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Prefix site-relative hrefs with the base URL.
        # NOTE(review): the character class lacks '.', '-', '_', '?', so
        # hrefs containing those are left untouched — confirm intended.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly; the platform default (cp1252 on Windows)
        # can fail on characters in the scraped text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # 'start' is a cmd.exe builtin, hence shell=True; Windows-only.
        # Never interpolate untrusted input into a shell string.
        subprocess.run(f'''start {urlId}.html''', shell=True)
        # [3] <-- Save ebook (epub/mobi)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateHtml(sys.argv[1])

Source Code: Let’s Jump In!

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId):
    """Fetch judgement *urlId*, save it as HTML, then convert it to
    epub (pandoc) and mobi (kindlegen) and open the result.

    Requires pandoc and kindlegen on PATH; 'start' makes this
    Windows-only. Errors are printed, not raised.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8')
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Prefix site-relative hrefs with the base URL.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly; the platform default (cp1252 on Windows)
        # can fail on characters in the scraped text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid '\s' escape the original
        # backslash produced inside the f-string; Windows accepts '/'.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        # 'start' is a cmd.exe builtin, hence shell=True.
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Disclaimer: Don’t Use This In Production Code

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId): # [1]
    """Fetch judgement *urlId*, save it as HTML, convert it to epub
    (pandoc) and mobi (kindlegen), then open it. Windows-only ('start');
    pandoc and kindlegen must be on PATH. Errors are printed."""
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8') # [2]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Prefix site-relative hrefs with the base URL.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly; the platform default (cp1252 on Windows)
        # can fail on characters in the scraped text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid '\s' escape the original
        # backslash produced inside the f-string; Windows accepts '/'.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        # 'start' is a cmd.exe builtin, hence shell=True.
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except Exception as e:
        print(e)
    return None


if __name__ == '__main__':
    generateMobi(sys.argv[1])

Slightly Better

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
import subprocess
import sys
from urllib import request, error

urlBase = 'https://indiankanoon.org'


def generateMobi(urlId): # [1]
    """Fetch judgement *urlId*, save it as HTML, convert it to epub
    (pandoc) and mobi (kindlegen), then open it.

    Windows-only ('start' is a cmd.exe builtin); pandoc and kindlegen
    must be on PATH. Failures are printed rather than raised.
    """
    url = f'{urlBase}/doc/{urlId}'
    try:
        response = request.urlopen(url).read().decode('utf-8') # [2]
        html = BeautifulSoup(response, 'lxml')
        judgement = html.find('div', class_='judgments')
        title = judgement.find('div', class_='doc_title').text
        content = f'''
        <html>
            <head><title>{title}</title></head>
            <body>{judgement}</body>
        </html>
        '''
        # Prefix site-relative hrefs with the base URL.
        content = re.sub(r'''(href=")([a-zA-Z0-9/]+)"''',
                         fr'''\1{urlBase}\2"''', content)
        # UTF-8 explicitly; the platform default (cp1252 on Windows)
        # can fail on characters in the scraped text.
        with open(f'{urlId}.html', 'w', encoding='utf-8') as f:
            f.write(content)
        # Forward slash avoids the invalid '\s' escape the original
        # backslash produced inside the f-string; Windows accepts '/'.
        subprocess.run(f'''pandoc {urlId}.html --epub-cover-image="resources/supreme_court_india.jpg" -o {urlId}.epub''', shell=True)
        subprocess.run(f'''kindlegen {urlId}.epub''', shell=True)
        subprocess.run(f'''start {urlId}.epub''', shell=True)
    except error.HTTPError as e:
        # Server replied with an HTTP error status (404, 503, ...).
        print(e)
    except error.URLError as e:
        # Network-level failure (DNS lookup, refused connection, ...).
        print(e)
    except Exception as e:
        # Last-resort catch so a parsing surprise doesn't crash the CLI.
        print(e)
    return None


if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('usage: generateMobi.py <docId>')
    generateMobi(sys.argv[1])

Where Else Can You Use This?

Homework

References

[1] “Web Scraping using Python” by Corey Schafer
[2] “RegEx using Python” by Corey Schafer
[3] “Web Scraping with Python” by Ryan Mitchell
[4] “Legal Aspects” by Data Carpentry